Install and load R packages

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

if (!require("GEOquery", quietly = TRUE))
    BiocManager::install("GEOquery") 

if (!require("rScudo", quietly = TRUE))
    BiocManager::install("rScudo") 

library(tidyverse)
library(dplyr)
library(GEOquery)
library(rScudo)
library(plotly)

Data selection

The analysis will use the dataset GSE20437 obtained from GEO.The dataset is generated from Affymetrix HU133A microarrays and contains 42 tissue samples.

In detail, the data includes:

Note that sample numbers correspond to individual patient samples.

# download the GSE20437 expression data series
gse <- getGEO("GSE20437", destdir= './data/', getGPL = F)
## Found 1 file(s)
## GSE20437_series_matrix.txt.gz
## Using locally cached version: ./data//GSE20437_series_matrix.txt.gz
# getGEO returns a list of expression objects, but...
length(gse) 
## [1] 1
# shows us there is only one object in it. 
# We assign it to the same variable.
gse <- gse[[1]]

Exploratory analysis

# show what we have:
show(gse)
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 22283 features, 42 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: GSM512539 GSM512540 ... GSM512580 (42 total)
##   varLabels: title geo_accession ... tissue:ch1 (38 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
##   pubMedIds: 20197764 
## Annotation: GPL96

The actual expression data are accessible in the exprs section of gse, an Expression Set and the generic data class that BioConductor uses for expression data.

head(exprs(gse)) 
##           GSM512539 GSM512540 GSM512541 GSM512542 GSM512543 GSM512544 GSM512545
## 1007_s_at    2461.4    3435.7    1932.5    2377.7    3055.3    2978.1    2348.5
## 1053_at        26.7     159.0      31.2     140.7      69.9      98.5      37.0
## 117_at         82.6     243.4     150.2      95.1     209.3     103.4      91.2
## 121_at        942.3     897.5     840.8     870.9     685.4     791.8     886.5
## 1255_g_at      71.8      87.9      75.4      58.1      31.8      40.3      70.5
## 1294_at       630.2     571.4     346.3     679.9    1289.3     421.1     417.6
##           GSM512546 GSM512547 GSM512548 GSM512549 GSM512550 GSM512551 GSM512552
## 1007_s_at    2963.9    2776.9    3088.9    3033.3    3037.1    3545.8    3322.6
## 1053_at        59.9      86.7     107.2      64.0      82.9      97.7      69.7
## 117_at        168.4     162.7     203.2     143.7     113.5      80.0     186.4
## 121_at        954.2     843.1     775.3     847.6     912.2     911.6     862.4
## 1255_g_at      43.3      51.6      42.6      74.9      53.7      30.5      15.2
## 1294_at       811.6     778.1     393.2     995.4     987.7     938.5     924.6
##           GSM512553 GSM512554 GSM512555 GSM512556 GSM512557 GSM512558 GSM512559
## 1007_s_at    1963.7    3609.6    2078.9    4138.6    4260.7    2453.6    2709.0
## 1053_at        82.0      45.6      84.5      31.7      37.4      82.4     204.8
## 117_at        106.6     145.6     144.4     133.6     278.6     173.0     147.8
## 121_at        705.0     984.6     853.8     846.8    1273.0     833.6     908.1
## 1255_g_at      42.5      76.6      88.2      90.6      65.8      25.8      77.5
## 1294_at       480.8    1054.1     632.0     448.0    1345.2    1248.9     405.7
##           GSM512560 GSM512561 GSM512562 GSM512563 GSM512564 GSM512565 GSM512566
## 1007_s_at    2612.5    4340.1    3155.3    2390.3    2738.8    3233.1    2836.6
## 1053_at       119.3      76.7     100.3     115.4      14.1      47.6      77.1
## 117_at        186.0     168.0      95.2      73.6     122.7     107.6     120.9
## 121_at        806.2     827.0     629.4     709.2     305.6     877.4     425.7
## 1255_g_at      84.3      87.9      44.6      59.3      12.0      82.1      59.2
## 1294_at       647.5    2218.1    1321.1     606.7    1709.9     980.8    1268.4
##           GSM512567 GSM512568 GSM512569 GSM512570 GSM512571 GSM512572 GSM512573
## 1007_s_at    2915.4    3457.5    2798.7    4370.2    2467.3    3669.5    3310.1
## 1053_at        47.1      47.0      83.2      40.2      80.3      24.1       8.8
## 117_at        143.4      92.5      72.1     131.8     156.4     165.8     141.6
## 121_at        643.8     771.3     681.1     812.7     533.4     746.9    1090.3
## 1255_g_at      62.2      28.3      97.6       8.1      17.9      53.0      39.9
## 1294_at       955.8    1157.5     888.6    1130.8     905.1    1138.5     483.0
##           GSM512574 GSM512575 GSM512576 GSM512577 GSM512578 GSM512579 GSM512580
## 1007_s_at    3942.2    4520.4    3596.1    2989.0    3164.5    2764.3    4258.5
## 1053_at        44.6      54.7      56.7      89.9      63.4      57.0      59.5
## 117_at         97.1     132.7     124.3     210.5     131.4      89.6     123.3
## 121_at       1008.7     718.6     988.4     295.9     957.3     630.8     869.2
## 1255_g_at      11.0      50.2      60.0      34.3      33.5      61.7      50.4
## 1294_at      1326.5    1179.4     668.3     863.2    1055.5    1287.6    1127.8

To conveniently access the data rows and columns present in exprs(gse), this matrix is assigned to its own variable ex.

# exprs (gse) is a matrix that we can assign to its own variable, to
# conveniently access the data rows and columns
ex <- exprs(gse)
dim(ex) # 42 sample, 22283 genes
## [1] 22283    42

The dataset contains gene expression data of 22283 genes (rows) from 42 patients (columns).

Pre-processing

# Analyze value distributions
#boxplot(ex)
boxplot(ex, main = 'Boxplot of the data before normalization')

I try to apply a log transformation to the data to obtain a boxplot easier to read.

ex2<-log(ex)
ex2 <- na.omit(as.matrix(ex2))
boxplot(ex2, main = 'Boxplot of the data after applying a logarithmic transformation')
Boxplot of the data after applying a logarithmic transformation

Boxplot of the data after applying a logarithmic transformation

From the boxplot after the log transformation, I can see that there is some variation in the median of the samples. So, I try to apply a median normalization to the data after the log transformation.

# MEDIAN NORMALIZATION
channel.medians=apply(log(ex),2,median)
normalized.log.ex=sweep(log(ex),2,channel.medians,"-")

# boxplot post median normalization on ex
boxplot(normalized.log.ex, main = 'Boxplot of the data after median normalization')
Boxplot of the data after median normalization

Boxplot of the data after median normalization

PCA

pca <- prcomp(t(normalized.log.ex))
summary(pca) # get the summary of the PCA, we get a table in which each column is a principal components and we have information on the standard deviation and the variation, etc
## Importance of components:
##                            PC1      PC2      PC3      PC4      PC5      PC6
## Standard deviation     38.8020 23.47065 20.63892 19.54327 18.43330 15.97431
## Proportion of Variance  0.1791  0.06552  0.05067  0.04543  0.04042  0.03035
## Cumulative Proportion   0.1791  0.24461  0.29527  0.34070  0.38112  0.41147
##                             PC7      PC8      PC9     PC10     PC11     PC12
## Standard deviation     15.54787 15.25339 14.95988 13.82648 13.67761 13.63388
## Proportion of Variance  0.02875  0.02767  0.02662  0.02274  0.02225  0.02211
## Cumulative Proportion   0.44023  0.46790  0.49452  0.51726  0.53951  0.56162
##                            PC13     PC14     PC15     PC16     PC17     PC18
## Standard deviation     13.39954 13.05683 12.96288 12.78929 12.64910 12.48152
## Proportion of Variance  0.02136  0.02028  0.01999  0.01946  0.01903  0.01853
## Cumulative Proportion   0.58298  0.60326  0.62324  0.64270  0.66173  0.68026
##                            PC19     PC20     PC21     PC22     PC23     PC24
## Standard deviation     12.32645 12.13869 12.04722 11.97088 11.90374 11.69483
## Proportion of Variance  0.01807  0.01753  0.01726  0.01705  0.01685  0.01627
## Cumulative Proportion   0.69833  0.71586  0.73312  0.75017  0.76702  0.78329
##                            PC25     PC26     PC27     PC28     PC29     PC30
## Standard deviation     11.60734 11.57222 11.46507 11.15870 11.04915 10.99769
## Proportion of Variance  0.01603  0.01593  0.01564  0.01481  0.01452  0.01439
## Cumulative Proportion   0.79932  0.81525  0.83088  0.84569  0.86021  0.87460
##                            PC31     PC32     PC33     PC34   PC35    PC36
## Standard deviation     10.68517 10.45191 10.25430 10.13818 9.9583 9.68456
## Proportion of Variance  0.01358  0.01299  0.01251  0.01223 0.0118 0.01116
## Cumulative Proportion   0.88818  0.90117  0.91368  0.92591 0.9377 0.94886
##                           PC37    PC38   PC39    PC40    PC41      PC42
## Standard deviation     9.50070 9.42259 9.3053 9.17335 8.95347 6.784e-14
## Proportion of Variance 0.01074 0.01056 0.0103 0.01001 0.00954 0.000e+00
## Cumulative Proportion  0.95960 0.97016 0.9805 0.99046 1.00000 1.000e+00
screeplot(pca) # this plot show the variance explained by the first 10 components
Variance explained by the first 10 components

Variance explained by the first 10 components

# draw PCA plot
group <- c(rep("cadetblue1",18), rep("red",18), rep("purple",6) ) # vector of colors based on the order of my data
plot(pca$x[,1], pca$x[,2], xlab="PCA1", ylab="PCA2", main="PCA for components 1 and 2", type="p", pch=10, col=group)
text(pca$x[,1], pca$x[,2], rownames(pca$data), cex=0.75)

The vector group used in the PCA plot is based on the data. The samples corresponding to the colors are the following:

  • Light blue: reduction mammoplasty (RM) breast epithelium samples

  • Red: histologically normal (HN) epithelial samples from breast cancer patient

  • Purple: histologically normal breast epithelium (NlEpi) from prophylactic mastectomy patient samples

Let’s try to explore an interactive PCA plot.

components<-pca[["x"]]
components<-data.frame(components)
type<-c(rep("RM", 18), rep("HN",18), rep("NlEpi",6))
components<-cbind(components, type )

fig <- plot_ly(components, x=~PC1, y=~PC2, 
               color=type,colors=c('cadetblue1', 'red','purple'), 
               type='scatter',mode='markers')
fig
fig2 <- plot_ly(components, x=~PC1, y=~PC2, z=~PC3, 
                color=type, colors=c('cadetblue1', 'red','purple'),
                mode='markers', marker = list(size = 4))
fig2
fig3 <- plot_ly(components, x=~PC1, y=~PC3, 
                color=type, colors=c('cadetblue1', 'red','purple'),
                type='scatter',mode='markers')
fig3

UMAP

umap_data <- umap::umap(t(normalized.log.ex), 
                        n_neighbors = sqrt(dim(t(normalized.log.ex))[1]),#or the square root of the rows 
                        min_dist = 0.1,         
                        metric = "euclidean",    #you can change it
                        n_components = 2) #used the default ones! 

umap_df <- data.frame(umap_data$layout) %>% tibble::rownames_to_column('Samples')
umap_df$type<-c(rep("RM", 18), rep("HN",18), rep("NlEpi",6))

library(plotly)

figUmap <- plot_ly(umap_df, 
                 x = ~X1, y = ~X2, color = umap_df$type,
                 colors = c('cadetblue1', 'red','purple'),   
                 mode = 'markers',
                 size=1)

# Display the 2D scatter plot
figUmap
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
umap_data_3D <- umap::umap(t(normalized.log.ex), 
                        n_neighbors = sqrt(dim(t(normalized.log.ex))[1]),
                      #or the square root of the rows, provato anche con 5 fa pena, anche 15 abbastanza pena  
                        min_dist = 0.1,         
                        metric = "euclidean",    #you can change it
                        n_components = 3) #used the default ones! 

umap_df_3D <- data.frame(umap_data_3D$layout) %>% tibble::rownames_to_column('Samples')
umap_df_3D$type<-c(rep("RM", 18), rep("HN",18), rep("NlEpi",6))

figUmap_3D <- plot_ly(umap_df_3D, 
                 x = ~X1, y = ~X2, z=~X3, color = umap_df_3D$type,
                 colors = c('cadetblue1', 'red','purple'),   
                 mode = 'markers',
                 size=1) %>% layout(title = 'Umap 1 control-tumors 3D')

# Display the 2D scatter plot
figUmap_3D
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d

```